Множественная проверка гипотез



In [2]:

    
import pandas as pd
import numpy as np

from scipy.stats import pearsonr
from statsmodels.sandbox.stats.multicomp import multipletests

Foodmart product sales



In [3]:

    
sales = pd.read_csv('foodmart.sales.tsv', sep = '\t', header = 0, parse_dates = [2])



In [4]:

    
sales.head()



In [5]:

    
products = pd.read_csv('foodmart.products.tsv', sep = '\t', header = 0)



In [6]:

    
products.head()









    Out[6]:






  
    
      
      product_class_id
      product_id
      brand_name
      product_name
      SKU
      SRP
      gross_weight
      net_weight
      recyclable_package
      low_fat
      units_per_case
      cases_per_pallet
      shelf_width
      shelf_height
      shelf_depth
    
  
  
    
      0
      30
      1
      Washington
      Washington Berry Juice
      90748583674
      2.85
      8.39
      6.39
      False
      False
      30
      14
      16.9
      12.60
      7.40
    
    
      1
      52
      2
      Washington
      Washington Mango Drink
      96516502499
      0.74
      7.42
      4.42
      False
      True
      18
      8
      13.4
      3.71
      22.60
    
    
      2
      52
      3
      Washington
      Washington Strawberry Drink
      58427771925
      0.83
      13.10
      11.10
      True
      True
      17
      13
      14.4
      11.00
      7.77
    
    
      3
      19
      4
      Washington
      Washington Cream Soda
      64412155747
      3.64
      10.60
      9.60
      True
      False
      26
      10
      22.9
      18.90
      7.93
    
    
      4
      19
      5
      Washington
      Washington Diet Soda
      85561191439
      2.19
      6.66
      4.65
      True
      False
      7
      10
      20.7
      21.90
      19.20



In [7]:

    
sales = sales.merge(products[['product_id', 'product_name']], 
                    on = ['product_id'], how = 'inner')



In [8]:

    
sales.head()









    Out[8]:






  
    
      
      product_id
      store_id
      date
      sales
      product_name
    
  
  
    
      0
      4
      6
      1997-01-01
      4
      Washington Cream Soda
    
    
      1
      4
      7
      1997-01-05
      3
      Washington Cream Soda
    
    
      2
      4
      6
      1997-01-06
      2
      Washington Cream Soda
    
    
      3
      4
      17
      1997-01-11
      2
      Washington Cream Soda
    
    
      4
      4
      24
      1997-01-11
      2
      Washington Cream Soda

Корреляция Пирсона



In [9]:

    
sparse_sales = pd.pivot_table(sales, values='sales', index=['date', 'store_id'],
                     columns=['product_name'], fill_value = 0)



In [10]:

    
sparse_sales.head()









    Out[10]:






  
    
      
      product_name
      ADJ Rosy Sunglasses
      Akron City Map
      Akron Eyeglass Screwdriver
      American Beef Bologna
      American Chicken Hot Dogs
      American Cole Slaw
      American Corned Beef
      American Foot-Long Hot Dogs
      American Low Fat Bologna
      American Low Fat Cole Slaw
      ...
      Washington Apple Juice
      Washington Berry Juice
      Washington Cola
      Washington Cranberry Juice
      Washington Cream Soda
      Washington Diet Cola
      Washington Diet Soda
      Washington Mango Drink
      Washington Orange Juice
      Washington Strawberry Drink
    
    
      date
      store_id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1997-01-01
      6
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      4
      0
      0
      0
      0
      0
    
    
      14
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1997-01-02
      11
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      23
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1997-01-03
      7
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      4
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 1559 columns



In [11]:

    
%%time 
corr_data = []

for i, lhs_column in enumerate(sparse_sales.columns):
    for j, rhs_column in enumerate(sparse_sales.columns):
        if i >= j:
            continue
        
        corr, p = pearsonr(sparse_sales[lhs_column], sparse_sales[rhs_column])
        corr_data.append([lhs_column, rhs_column, corr, p])









    



Wall time: 1min 33s



In [12]:

    
sales_correlation = pd.DataFrame.from_records(corr_data)
sales_correlation.columns = ['product_A', 'product_B', 'corr', 'p']



In [13]:

    
sales_correlation.head()









    Out[13]:






  
    
      
      product_A
      product_B
      corr
      p
    
  
  
    
      0
      ADJ Rosy Sunglasses
      Akron City Map
      0.076608
      0.032414
    
    
      1
      ADJ Rosy Sunglasses
      Akron Eyeglass Screwdriver
      -0.006581
      0.854396
    
    
      2
      ADJ Rosy Sunglasses
      American Beef Bologna
      0.038685
      0.280546
    
    
      3
      ADJ Rosy Sunglasses
      American Chicken Hot Dogs
      0.041105
      0.251529
    
    
      4
      ADJ Rosy Sunglasses
      American Cole Slaw
      -0.045887
      0.200484

Сколько гипотез об отсутствии корреляции отвергается без поправки на множественную проверку?



In [14]:

    
(sales_correlation.p < 0.05).value_counts()









    Out[14]:





False    982453
True     232008
Name: p, dtype: int64

Поправка на множественную проверку

Метод Холма



In [15]:

    
reject, p_corrected, a1, a2 = multipletests(sales_correlation.p, 
                                            alpha = 0.05, 
                                            method = 'holm')



In [16]:

    
sales_correlation['p_corrected'] = p_corrected
sales_correlation['reject'] = reject



In [17]:

    
sales_correlation.head()









    Out[17]:






  
    
      
      product_A
      product_B
      corr
      p
      p_corrected
      reject
    
  
  
    
      0
      ADJ Rosy Sunglasses
      Akron City Map
      0.076608
      0.032414
      1
      False
    
    
      1
      ADJ Rosy Sunglasses
      Akron Eyeglass Screwdriver
      -0.006581
      0.854396
      1
      False
    
    
      2
      ADJ Rosy Sunglasses
      American Beef Bologna
      0.038685
      0.280546
      1
      False
    
    
      3
      ADJ Rosy Sunglasses
      American Chicken Hot Dogs
      0.041105
      0.251529
      1
      False
    
    
      4
      ADJ Rosy Sunglasses
      American Cole Slaw
      -0.045887
      0.200484
      1
      False



In [18]:

    
sales_correlation.reject.value_counts()









    Out[18]:





False    1212733
True        1728
Name: reject, dtype: int64



In [19]:

    
sales_correlation[sales_correlation.reject == True].sort_values(by='corr', ascending=False)









    Out[19]:






  
    
      
      product_A
      product_B
      corr
      p
      p_corrected
      reject
    
  
  
    
      1063670
      Just Right Vegetable Soup
      Plato French Roast Coffee
      0.340598
      1.226033e-22
      1.488970e-16
      True
    
    
      885574
      Great Muffins
      Nationeel Grape Fruit Roll
      0.322176
      2.688803e-20
      3.265443e-14
      True
    
    
      473067
      Club Low Fat Cottage Cheese
      Skinner Strawberry Drink
      0.306701
      1.883995e-18
      2.288034e-12
      True
    
    
      1181001
      Robust Monthly Home Magazine
      Tri-State Lemons
      0.303269
      4.674973e-18
      5.677558e-12
      True
    
    
      1160248
      Pleasant Regular Ramen Soup
      Shady Lake Ravioli
      0.298502
      1.619119e-17
      1.966350e-11
      True
    
    
      885261
      Great Muffins
      High Quality Copper Cleaner
      0.297160
      2.287551e-17
      2.778130e-11
      True
    
    
      1151585
      Plato Salt
      Tri-State Onions
      0.291224
      1.032464e-16
      1.253881e-10
      True
    
    
      1092548
      Mighty Good Monthly Home Magazine
      Tell Tale Baby Onion
      0.290458
      1.250831e-16
      1.519077e-10
      True
    
    
      678154
      Even Better Buttermilk
      Sunset Toilet Paper
      0.290046
      1.386446e-16
      1.683774e-10
      True
    
    
      914945
      Hermanos Macintosh Apples
      Plato French Roast Coffee
      0.289095
      1.757608e-16
      2.134531e-10
      True
    
    
      296176
      Booker Blueberry Yogurt
      Fast Strawberry Fruit Roll
      0.288495
      2.040104e-16
      2.477606e-10
      True
    
    
      1195806
      Sunset Glass Cleaner
      Tell Tale Green Pepper
      0.287523
      2.595168e-16
      3.151701e-10
      True
    
    
      778504
      Faux Products Mint Mouthwash
      Skinner Cream Soda
      0.285697
      4.069434e-16
      4.942121e-10
      True
    
    
      45043
      BBB Best Apple Butter
      Carrington Frozen Carrots
      0.284404
      5.584481e-16
      6.782062e-10
      True
    
    
      490841
      Colony Pumpernickel Bread
      Imagine Frozen Broccoli
      0.284278
      5.759605e-16
      6.994735e-10
      True
    
    
      922125
      Hermanos Plums
      Tell Tale Summer Squash
      0.284008
      6.151860e-16
      7.471101e-10
      True
    
    
      812401
      Fort West Raisins
      Sunset AA-Size Batteries
      0.281877
      1.031856e-15
      1.253133e-09
      True
    
    
      834549
      Golden Blueberry Waffles
      Super Regular Coffee
      0.280110
      1.579078e-15
      1.917702e-09
      True
    
    
      87152
      BBB Best Vegetable Oil
      Tri-State Elephant Garlic
      0.279151
      1.986864e-15
      2.412933e-09
      True
    
    
      623896
      Dollar Monthly Home Magazine
      Portsmouth Chardonnay Wine
      0.278492
      2.325036e-15
      2.823621e-09
      True
    
    
      98754
      Best Choice Cheese Dip
      Nationeel Fudge Brownies
      0.276663
      3.589792e-15
      4.359591e-09
      True
    
    
      1132103
      PigTail Beef TV Dinner
      Tell Tale Asparagus
      0.273964
      6.774349e-15
      8.227040e-09
      True
    
    
      1148273
      Plato French Roast Coffee
      Tell Tale Macintosh Apples
      0.273905
      6.869042e-15
      8.342032e-09
      True
    
    
      423539
      Carrington Frozen Chicken Breast
      Red Wing Screw Driver
      0.273705
      7.196552e-15
      8.739766e-09
      True
    
    
      364300
      CDR Extra Chunky Peanut Butter
      Imagine Frozen Sausage Pizza
      0.273332
      7.851595e-15
      9.535267e-09
      True
    
    
      60735
      BBB Best Decaf Coffee
      Gulf Coast Bubble Gum
      0.273142
      8.207575e-15
      9.967575e-09
      True
    
    
      353311
      CDR Apple Preserves
      Cutting Edge Potato Salad
      0.273096
      8.296192e-15
      1.007519e-08
      True
    
    
      921798
      Hermanos Plums
      Nationeel Graham Crackers
      0.272966
      8.552863e-15
      1.038689e-08
      True
    
    
      777975
      Faux Products Mint Mouthwash
      Hermanos Macintosh Apples
      0.272103
      1.045324e-14
      1.269477e-08
      True
    
    
      641278
      Ebony Firm Tofu
      Sunset Counter Cleaner
      0.270295
      1.588138e-14
      1.928686e-08
      True
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      513582
      Consolidated Extra Moisture Shampoo
      Lake Corned Beef
      0.195186
      3.896997e-08
      4.726133e-02
      True
    
    
      436952
      Carrington Ice Cream
      Horatio Raspberry Fruit Roll
      0.195180
      3.900854e-08
      4.730807e-02
      True
    
    
      847133
      Golden Frozen Sausage Pizza
      Landslide Apple Butter
      0.195165
      3.910777e-08
      4.742837e-02
      True
    
    
      933970
      High Quality 75 Watt Lightbulb
      Plato French Roast Coffee
      0.195161
      3.913360e-08
      4.745966e-02
      True
    
    
      44287
      Atomic White Chocolate Bar
      Modell Muffins
      0.195138
      3.928076e-08
      4.763809e-02
      True
    
    
      505488
      Consolidated Apricot Shampoo
      Pleasant Turkey Noodle Soup
      0.195116
      3.941970e-08
      4.780656e-02
      True
    
    
      790723
      Fort West Cheese Crackers
      Top Measure Chardonnay
      0.195099
      3.952766e-08
      4.793744e-02
      True
    
    
      733912
      Fast Beef Jerky
      Good Light Wine
      0.195064
      3.975083e-08
      4.820805e-02
      True
    
    
      1159301
      Pleasant Fancy Canned Sardines
      Steady Mint Mouthwash
      0.195059
      3.978462e-08
      4.824900e-02
      True
    
    
      927407
      Hermanos Summer Squash
      Super Strawberry Preserves
      0.195031
      3.996668e-08
      4.846975e-02
      True
    
    
      815542
      Fort West Sesame Crackers
      Horatio Fudge Cookies
      0.194993
      4.021331e-08
      4.876881e-02
      True
    
    
      725774
      Fantastic English Muffins
      Tell Tale Mandarin Oranges
      0.194990
      4.023652e-08
      4.879692e-02
      True
    
    
      771540
      Faux Products Childrens Cold Remedy
      Horatio Chocolate Chip Cookies
      0.194986
      4.025788e-08
      4.882279e-02
      True
    
    
      509641
      Consolidated Conditioning Shampoo
      Footnote Extra Lean Hamburger
      0.194986
      4.026006e-08
      4.882538e-02
      True
    
    
      532932
      Cormorant C-Size Batteries
      Washington Strawberry Drink
      0.194982
      4.028907e-08
      4.886053e-02
      True
    
    
      511031
      Consolidated Deodorant
      High Quality Soft Napkins
      0.194981
      4.029434e-08
      4.886688e-02
      True
    
    
      1097081
      Modell Wheat Bread
      Washington Apple Juice
      0.194977
      4.031823e-08
      4.889581e-02
      True
    
    
      1077587
      Landslide Corn Oil
      Plato Chunky Peanut Butter
      0.194963
      4.041312e-08
      4.901084e-02
      True
    
    
      338688
      Bravo Fancy Canned Oysters
      Carlson String Cheese
      0.194958
      4.044278e-08
      4.904678e-02
      True
    
    
      1125069
      Nationeel Potato Chips
      Tell Tale Canned Peanuts
      0.194947
      4.051404e-08
      4.913316e-02
      True
    
    
      1031716
      Imagine Frozen Pepperoni Pizza
      Tri-State Prepared Salad
      0.194947
      4.051638e-08
      4.913596e-02
      True
    
    
      35031
      Atomic Malted Milk Balls
      Just Right Vegetable Soup
      0.194939
      4.056627e-08
      4.919642e-02
      True
    
    
      3896
      Akron Eyeglass Screwdriver
      Hermanos Macintosh Apples
      0.194929
      4.063596e-08
      4.928090e-02
      True
    
    
      1101432
      Moms Potato Salad
      National Small Brown Eggs
      0.194920
      4.069757e-08
      4.935558e-02
      True
    
    
      209481
      Big Time Grape Popsicles
      Pleasant Vegetable Soup
      0.194915
      4.072982e-08
      4.939464e-02
      True
    
    
      662127
      Ebony Potatos
      Just Right Fancy Canned Anchovies
      0.194914
      4.073740e-08
      4.940380e-02
      True
    
    
      565740
      Cutting Edge Cole Slaw
      Horatio Potato Chips
      0.194875
      4.099170e-08
      4.971215e-02
      True
    
    
      615865
      Discover Ravioli
      High Quality Soft Napkins
      0.194869
      4.103075e-08
      4.975947e-02
      True
    
    
      559150
      Cormorant Toilet Bowl Cleaner
      Plato Chunky Peanut Butter
      0.194863
      4.107241e-08
      4.980995e-02
      True
    
    
      149050
      Better Beef Soup
      Pleasant Canned Yams
      0.194856
      4.112254e-08
      4.987070e-02
      True
    
  

1728 rows × 6 columns

Метод Бенджамини-Хохберга



In [20]:

    
reject, p_corrected, a1, a2 = multipletests(sales_correlation.p, 
                                            alpha = 0.05, 
                                            method = 'fdr_bh')



In [21]:

    
sales_correlation['p_corrected'] = p_corrected
sales_correlation['reject'] = reject



In [22]:

    
sales_correlation.head()









    Out[22]:






  
    
      
      product_A
      product_B
      corr
      p
      p_corrected
      reject
    
  
  
    
      0
      ADJ Rosy Sunglasses
      Akron City Map
      0.076608
      0.032414
      0.203716
      False
    
    
      1
      ADJ Rosy Sunglasses
      Akron Eyeglass Screwdriver
      -0.006581
      0.854396
      0.956078
      False
    
    
      2
      ADJ Rosy Sunglasses
      American Beef Bologna
      0.038685
      0.280546
      0.630699
      False
    
    
      3
      ADJ Rosy Sunglasses
      American Chicken Hot Dogs
      0.041105
      0.251529
      0.600790
      False
    
    
      4
      ADJ Rosy Sunglasses
      American Cole Slaw
      -0.045887
      0.200484
      0.541916
      False



In [23]:

    
sales_correlation.reject.value_counts()









    Out[23]:





False    1138407
True       76054
Name: reject, dtype: int64



In [24]:

    
sales_correlation[sales_correlation.reject == True].sort_values(by='corr', ascending=False)









    Out[24]:






  
    
      
      product_A
      product_B
      corr
      p
      p_corrected
      reject
    
  
  
    
      1063670
      Just Right Vegetable Soup
      Plato French Roast Coffee
      0.340598
      1.226033e-22
      1.488970e-16
      True
    
    
      885574
      Great Muffins
      Nationeel Grape Fruit Roll
      0.322176
      2.688803e-20
      1.632723e-14
      True
    
    
      473067
      Club Low Fat Cottage Cheese
      Skinner Strawberry Drink
      0.306701
      1.883995e-18
      7.626793e-13
      True
    
    
      1181001
      Robust Monthly Home Magazine
      Tri-State Lemons
      0.303269
      4.674973e-18
      1.419393e-12
      True
    
    
      1160248
      Pleasant Regular Ramen Soup
      Shady Lake Ravioli
      0.298502
      1.619119e-17
      3.932713e-12
      True
    
    
      885261
      Great Muffins
      High Quality Copper Cleaner
      0.297160
      2.287551e-17
      4.630236e-12
      True
    
    
      1151585
      Plato Salt
      Tri-State Onions
      0.291224
      1.032464e-16
      1.791267e-11
      True
    
    
      1092548
      Mighty Good Monthly Home Magazine
      Tell Tale Baby Onion
      0.290458
      1.250831e-16
      1.870872e-11
      True
    
    
      678154
      Even Better Buttermilk
      Sunset Toilet Paper
      0.290046
      1.386446e-16
      1.870872e-11
      True
    
    
      914945
      Hermanos Macintosh Apples
      Plato French Roast Coffee
      0.289095
      1.757608e-16
      2.134546e-11
      True
    
    
      296176
      Booker Blueberry Yogurt
      Fast Strawberry Fruit Roll
      0.288495
      2.040104e-16
      2.252388e-11
      True
    
    
      1195806
      Sunset Glass Cleaner
      Tell Tale Green Pepper
      0.287523
      2.595168e-16
      2.626442e-11
      True
    
    
      778504
      Faux Products Mint Mouthwash
      Skinner Cream Soda
      0.285697
      4.069434e-16
      3.801669e-11
      True
    
    
      45043
      BBB Best Apple Butter
      Carrington Frozen Carrots
      0.284404
      5.584481e-16
      4.663210e-11
      True
    
    
      490841
      Colony Pumpernickel Bread
      Imagine Frozen Broccoli
      0.284278
      5.759605e-16
      4.663210e-11
      True
    
    
      922125
      Hermanos Plums
      Tell Tale Summer Squash
      0.284008
      6.151860e-16
      4.669496e-11
      True
    
    
      812401
      Fort West Raisins
      Sunset AA-Size Batteries
      0.281877
      1.031856e-15
      7.371465e-11
      True
    
    
      834549
      Golden Blueberry Waffles
      Super Regular Coffee
      0.280110
      1.579078e-15
      1.065405e-10
      True
    
    
      87152
      BBB Best Vegetable Oil
      Tri-State Elephant Garlic
      0.279151
      1.986864e-15
      1.269983e-10
      True
    
    
      623896
      Dollar Monthly Home Magazine
      Portsmouth Chardonnay Wine
      0.278492
      2.325036e-15
      1.411833e-10
      True
    
    
      98754
      Best Choice Cheese Dip
      Nationeel Fudge Brownies
      0.276663
      3.589792e-15
      2.076030e-10
      True
    
    
      1132103
      PigTail Beef TV Dinner
      Tell Tale Asparagus
      0.273964
      6.774349e-15
      3.627036e-10
      True
    
    
      1148273
      Plato French Roast Coffee
      Tell Tale Macintosh Apples
      0.273905
      6.869042e-15
      3.627036e-10
      True
    
    
      423539
      Carrington Frozen Chicken Breast
      Red Wing Screw Driver
      0.273705
      7.196552e-15
      3.641638e-10
      True
    
    
      364300
      CDR Extra Chunky Peanut Butter
      Imagine Frozen Sausage Pizza
      0.273332
      7.851595e-15
      3.709685e-10
      True
    
    
      60735
      BBB Best Decaf Coffee
      Gulf Coast Bubble Gum
      0.273142
      8.207575e-15
      3.709685e-10
      True
    
    
      353311
      CDR Apple Preserves
      Cutting Edge Potato Salad
      0.273096
      8.296192e-15
      3.709685e-10
      True
    
    
      921798
      Hermanos Plums
      Nationeel Graham Crackers
      0.272966
      8.552863e-15
      3.709685e-10
      True
    
    
      777975
      Faux Products Mint Mouthwash
      Hermanos Macintosh Apples
      0.272103
      1.045324e-14
      4.377606e-10
      True
    
    
      641278
      Ebony Firm Tofu
      Sunset Counter Cleaner
      0.270295
      1.588138e-14
      6.429108e-10
      True
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      503962
      Consolidated Angled Toothbrush
      Hilltop Multi-Symptom Cold Remedy
      0.105676
      3.127646e-03
      4.996220e-02
      True
    
    
      685978
      Even Better Low Fat Sour Cream
      Imagine Frozen Sausage Pizza
      0.105676
      3.127694e-03
      4.996220e-02
      True
    
    
      189763
      Big Time Fajita French Fries
      Tell Tale Tomatos
      0.105675
      3.127756e-03
      4.996220e-02
      True
    
    
      1083389
      Landslide Pepper
      Shady Lake Rice Medly
      0.105675
      3.127760e-03
      4.996220e-02
      True
    
    
      899101
      Hermanos Baby Onion
      Lake Pimento Loaf
      0.105675
      3.127797e-03
      4.996220e-02
      True
    
    
      1210319
      Tip Top Lox
      Tri-State Limes
      0.105675
      3.127831e-03
      4.996220e-02
      True
    
    
      819313
      Framton Eyeglass Screwdriver
      Nationeel Grape Fruit Roll
      0.105675
      3.127870e-03
      4.996220e-02
      True
    
    
      272090
      Blue Label Creamed Corn
      Modell Bagels
      0.105675
      3.127971e-03
      4.996315e-02
      True
    
    
      1013449
      Horatio Low Fat Chips
      Plato Tomato Sauce
      0.105674
      3.128012e-03
      4.996315e-02
      True
    
    
      452542
      Choice Mint Chocolate Bar
      Cormorant Paper Towels
      0.105674
      3.128079e-03
      4.996333e-02
      True
    
    
      1073971
      Landslide Apple Jam
      Red Spade Foot-Long Hot Dogs
      0.105674
      3.128105e-03
      4.996333e-02
      True
    
    
      1202407
      Super Regular Coffee
      Tell Tale Mushrooms
      0.105673
      3.128419e-03
      4.996769e-02
      True
    
    
      319623
      Booker Whole Milk
      PigTail Lime Popsicles
      0.105672
      3.128675e-03
      4.997112e-02
      True
    
    
      863249
      Good Light Wine
      Landslide Strawberry Jam
      0.105671
      3.129087e-03
      4.997703e-02
      True
    
    
      832433
      Golden Apple Cinnamon Waffles
      Landslide Canola Oil
      0.105670
      3.129285e-03
      4.997955e-02
      True
    
    
      184296
      Big Time Beef TV Dinner
      Carlson String Cheese
      0.105669
      3.129490e-03
      4.998204e-02
      True
    
    
      205949
      Big Time Frozen Pepperoni Pizza
      Even Better String Cheese
      0.105669
      3.129564e-03
      4.998204e-02
      True
    
    
      244890
      Bird Call Laundry Detergent
      Tri-State Mushrooms
      0.105669
      3.129583e-03
      4.998204e-02
      True
    
    
      703436
      Excellent Cola
      Monarch Manicotti
      0.105669
      3.129606e-03
      4.998204e-02
      True
    
    
      254265
      Black Tie City Map
      PigTail Frozen Sausage Pizza
      0.105667
      3.130062e-03
      4.998866e-02
      True
    
    
      997510
      Hilltop Toothpaste
      Tell Tale Broccoli
      0.105667
      3.130214e-03
      4.999043e-02
      True
    
    
      463130
      Club Blueberry Yogurt
      PigTail Blueberry Waffles
      0.105666
      3.130379e-03
      4.999242e-02
      True
    
    
      1025556
      Imagine Frozen Carrots
      Thresher Bubble Gum
      0.105666
      3.130446e-03
      4.999283e-02
      True
    
    
      7211
      American Chicken Hot Dogs
      Just Right Canned Peas
      0.105665
      3.130594e-03
      4.999454e-02
      True
    
    
      282728
      Blue Label Rice Soup
      Gulf Coast Mint Chocolate Bar
      0.105665
      3.130693e-03
      4.999516e-02
      True
    
    
      108018
      Best Choice Dried Dates
      Tell Tale Party Nuts
      0.105665
      3.130716e-03
      4.999516e-02
      True
    
    
      419799
      Carrington Frozen Carrots
      Sphinx Muffins
      0.105664
      3.130854e-03
      4.999649e-02
      True
    
    
      1184578
      Skinner Diet Cola
      Special Wheat Puffs
      0.105664
      3.130881e-03
      4.999649e-02
      True
    
    
      821999
      Gauss Monthly Fashion Magazine
      PigTail Beef TV Dinner
      0.105664
      3.130969e-03
      4.999724e-02
      True
    
    
      484896
      Colony Bagels
      Lake Roasted Chicken
      0.105664
      3.131013e-03
      4.999728e-02
      True
    
  

76054 rows × 6 columns

Корреляционный метод



In [ ]:

    
%%time 
corr_data = []

for i, lhs_column in enumerate(sparse_sales.columns):
    for j, rhs_column in enumerate(sparse_sales.columns):
        if i >= j:
            continue
        
        corr, p = pearsonr(sparse_sales[lhs_column], sparse_sales[rhs_column])
        prod = (sparse_sales[lhs_column]-np.mean(sparse_sales[lhs_column])) * (sparse_sales[rhs_column]-np.mean(sparse_sales[rhs_column]))
        theta = np.mean((prod - np.cov(sparse_sales)[1,2])**2)
        T = sum(prod) / np.sqrt(n * theta)
        corr_data.append([lhs_column, rhs_column, corr, p, T])



In [48]:

    
sales_correlation = pd.DataFrame.from_records(corr_data)
sales_correlation.columns = ['product_A', 'product_B', 'corr', 'p', 'T']



In [ ]:

    
alpha = 1
k = df.shape[1]
a = 2 * np.log(np.log(k))
b = np.sqrt(4 * np.log(p) - a)

	product_id	store_id	date	sales
0	4	6	1997-01-01	4
1	25	6	1997-01-01	3
2	48	6	1997-01-01	3
3	76	6	1997-01-01	4
4	119	6	1997-01-01	3

	product_class_id	product_id	brand_name	product_name	SKU	SRP	gross_weight	net_weight	recyclable_package	low_fat	units_per_case	cases_per_pallet	shelf_width	shelf_height	shelf_depth
0	30	1	Washington	Washington Berry Juice	90748583674	2.85	8.39	6.39	False	False	30	14	16.9	12.60	7.40
1	52	2	Washington	Washington Mango Drink	96516502499	0.74	7.42	4.42	False	True	18	8	13.4	3.71	22.60
2	52	3	Washington	Washington Strawberry Drink	58427771925	0.83	13.10	11.10	True	True	17	13	14.4	11.00	7.77
3	19	4	Washington	Washington Cream Soda	64412155747	3.64	10.60	9.60	True	False	26	10	22.9	18.90	7.93
4	19	5	Washington	Washington Diet Soda	85561191439	2.19	6.66	4.65	True	False	7	10	20.7	21.90	19.20

	product_id	store_id	date	sales	product_name
0	4	6	1997-01-01	4	Washington Cream Soda
1	4	7	1997-01-05	3	Washington Cream Soda
2	4	6	1997-01-06	2	Washington Cream Soda
3	4	17	1997-01-11	2	Washington Cream Soda
4	4	24	1997-01-11	2	Washington Cream Soda

	product_name	ADJ Rosy Sunglasses	Akron City Map	Akron Eyeglass Screwdriver	American Beef Bologna	American Chicken Hot Dogs	American Cole Slaw	American Corned Beef	American Foot-Long Hot Dogs	American Low Fat Bologna	American Low Fat Cole Slaw	...	Washington Apple Juice	Washington Berry Juice	Washington Cola	Washington Cranberry Juice	Washington Cream Soda	Washington Diet Cola	Washington Diet Soda	Washington Mango Drink	Washington Orange Juice	Washington Strawberry Drink
date	store_id
1997-01-01	6	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	4	0	0	0	0	0
1997-01-01	14	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1997-01-02	11	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1997-01-02	23	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1997-01-03	7	0	0	0	0	0	0	0	0	0	0	...	0	4	0	0	0	0	0	0	0	0

	product_A	product_B	corr	p
0	ADJ Rosy Sunglasses	Akron City Map	0.076608	0.032414
1	ADJ Rosy Sunglasses	Akron Eyeglass Screwdriver	-0.006581	0.854396
2	ADJ Rosy Sunglasses	American Beef Bologna	0.038685	0.280546
3	ADJ Rosy Sunglasses	American Chicken Hot Dogs	0.041105	0.251529
4	ADJ Rosy Sunglasses	American Cole Slaw	-0.045887	0.200484

	product_A	product_B	corr	p	p_corrected	reject
1063670	Just Right Vegetable Soup	Plato French Roast Coffee	0.340598	1.226033e-22	1.488970e-16	True
885574	Great Muffins	Nationeel Grape Fruit Roll	0.322176	2.688803e-20	3.265443e-14	True
473067	Club Low Fat Cottage Cheese	Skinner Strawberry Drink	0.306701	1.883995e-18	2.288034e-12	True
1181001	Robust Monthly Home Magazine	Tri-State Lemons	0.303269	4.674973e-18	5.677558e-12	True
1160248	Pleasant Regular Ramen Soup	Shady Lake Ravioli	0.298502	1.619119e-17	1.966350e-11	True
885261	Great Muffins	High Quality Copper Cleaner	0.297160	2.287551e-17	2.778130e-11	True
1151585	Plato Salt	Tri-State Onions	0.291224	1.032464e-16	1.253881e-10	True
1092548	Mighty Good Monthly Home Magazine	Tell Tale Baby Onion	0.290458	1.250831e-16	1.519077e-10	True
678154	Even Better Buttermilk	Sunset Toilet Paper	0.290046	1.386446e-16	1.683774e-10	True
914945	Hermanos Macintosh Apples	Plato French Roast Coffee	0.289095	1.757608e-16	2.134531e-10	True
296176	Booker Blueberry Yogurt	Fast Strawberry Fruit Roll	0.288495	2.040104e-16	2.477606e-10	True
1195806	Sunset Glass Cleaner	Tell Tale Green Pepper	0.287523	2.595168e-16	3.151701e-10	True
778504	Faux Products Mint Mouthwash	Skinner Cream Soda	0.285697	4.069434e-16	4.942121e-10	True
45043	BBB Best Apple Butter	Carrington Frozen Carrots	0.284404	5.584481e-16	6.782062e-10	True
490841	Colony Pumpernickel Bread	Imagine Frozen Broccoli	0.284278	5.759605e-16	6.994735e-10	True
922125	Hermanos Plums	Tell Tale Summer Squash	0.284008	6.151860e-16	7.471101e-10	True
812401	Fort West Raisins	Sunset AA-Size Batteries	0.281877	1.031856e-15	1.253133e-09	True
834549	Golden Blueberry Waffles	Super Regular Coffee	0.280110	1.579078e-15	1.917702e-09	True
87152	BBB Best Vegetable Oil	Tri-State Elephant Garlic	0.279151	1.986864e-15	2.412933e-09	True
623896	Dollar Monthly Home Magazine	Portsmouth Chardonnay Wine	0.278492	2.325036e-15	2.823621e-09	True
98754	Best Choice Cheese Dip	Nationeel Fudge Brownies	0.276663	3.589792e-15	4.359591e-09	True
1132103	PigTail Beef TV Dinner	Tell Tale Asparagus	0.273964	6.774349e-15	8.227040e-09	True
1148273	Plato French Roast Coffee	Tell Tale Macintosh Apples	0.273905	6.869042e-15	8.342032e-09	True
423539	Carrington Frozen Chicken Breast	Red Wing Screw Driver	0.273705	7.196552e-15	8.739766e-09	True
364300	CDR Extra Chunky Peanut Butter	Imagine Frozen Sausage Pizza	0.273332	7.851595e-15	9.535267e-09	True
60735	BBB Best Decaf Coffee	Gulf Coast Bubble Gum	0.273142	8.207575e-15	9.967575e-09	True
353311	CDR Apple Preserves	Cutting Edge Potato Salad	0.273096	8.296192e-15	1.007519e-08	True
921798	Hermanos Plums	Nationeel Graham Crackers	0.272966	8.552863e-15	1.038689e-08	True
777975	Faux Products Mint Mouthwash	Hermanos Macintosh Apples	0.272103	1.045324e-14	1.269477e-08	True
641278	Ebony Firm Tofu	Sunset Counter Cleaner	0.270295	1.588138e-14	1.928686e-08	True
...	...	...	...	...	...	...
513582	Consolidated Extra Moisture Shampoo	Lake Corned Beef	0.195186	3.896997e-08	4.726133e-02	True
436952	Carrington Ice Cream	Horatio Raspberry Fruit Roll	0.195180	3.900854e-08	4.730807e-02	True
847133	Golden Frozen Sausage Pizza	Landslide Apple Butter	0.195165	3.910777e-08	4.742837e-02	True
933970	High Quality 75 Watt Lightbulb	Plato French Roast Coffee	0.195161	3.913360e-08	4.745966e-02	True
44287	Atomic White Chocolate Bar	Modell Muffins	0.195138	3.928076e-08	4.763809e-02	True
505488	Consolidated Apricot Shampoo	Pleasant Turkey Noodle Soup	0.195116	3.941970e-08	4.780656e-02	True
790723	Fort West Cheese Crackers	Top Measure Chardonnay	0.195099	3.952766e-08	4.793744e-02	True
733912	Fast Beef Jerky	Good Light Wine	0.195064	3.975083e-08	4.820805e-02	True
1159301	Pleasant Fancy Canned Sardines	Steady Mint Mouthwash	0.195059	3.978462e-08	4.824900e-02	True
927407	Hermanos Summer Squash	Super Strawberry Preserves	0.195031	3.996668e-08	4.846975e-02	True
815542	Fort West Sesame Crackers	Horatio Fudge Cookies	0.194993	4.021331e-08	4.876881e-02	True
725774	Fantastic English Muffins	Tell Tale Mandarin Oranges	0.194990	4.023652e-08	4.879692e-02	True
771540	Faux Products Childrens Cold Remedy	Horatio Chocolate Chip Cookies	0.194986	4.025788e-08	4.882279e-02	True
509641	Consolidated Conditioning Shampoo	Footnote Extra Lean Hamburger	0.194986	4.026006e-08	4.882538e-02	True
532932	Cormorant C-Size Batteries	Washington Strawberry Drink	0.194982	4.028907e-08	4.886053e-02	True
511031	Consolidated Deodorant	High Quality Soft Napkins	0.194981	4.029434e-08	4.886688e-02	True
1097081	Modell Wheat Bread	Washington Apple Juice	0.194977	4.031823e-08	4.889581e-02	True
1077587	Landslide Corn Oil	Plato Chunky Peanut Butter	0.194963	4.041312e-08	4.901084e-02	True
338688	Bravo Fancy Canned Oysters	Carlson String Cheese	0.194958	4.044278e-08	4.904678e-02	True
1125069	Nationeel Potato Chips	Tell Tale Canned Peanuts	0.194947	4.051404e-08	4.913316e-02	True
1031716	Imagine Frozen Pepperoni Pizza	Tri-State Prepared Salad	0.194947	4.051638e-08	4.913596e-02	True
35031	Atomic Malted Milk Balls	Just Right Vegetable Soup	0.194939	4.056627e-08	4.919642e-02	True
3896	Akron Eyeglass Screwdriver	Hermanos Macintosh Apples	0.194929	4.063596e-08	4.928090e-02	True
1101432	Moms Potato Salad	National Small Brown Eggs	0.194920	4.069757e-08	4.935558e-02	True
209481	Big Time Grape Popsicles	Pleasant Vegetable Soup	0.194915	4.072982e-08	4.939464e-02	True
662127	Ebony Potatos	Just Right Fancy Canned Anchovies	0.194914	4.073740e-08	4.940380e-02	True
565740	Cutting Edge Cole Slaw	Horatio Potato Chips	0.194875	4.099170e-08	4.971215e-02	True
615865	Discover Ravioli	High Quality Soft Napkins	0.194869	4.103075e-08	4.975947e-02	True
559150	Cormorant Toilet Bowl Cleaner	Plato Chunky Peanut Butter	0.194863	4.107241e-08	4.980995e-02	True
149050	Better Beef Soup	Pleasant Canned Yams	0.194856	4.112254e-08	4.987070e-02	True